detach("package:dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nycflights13)

数据

library(nycflights13)
dim(flights)
## [1] 336776     19
data(flights)

dplyr动作

  1. 取行:filter()
  2. 排序: arrange()
  3. 取列: select()
  4. 生成新变量: mutate()
  5. 分组汇总: group_by() %>% summarise()
  6. 动作连贯:%>%: 快捷键: ctrl + shift + M

filter()

dplyr::filter(flights, month ==1, day == 1)

base R 方法

flights[flights$month == 1 & flights$day == 1, ]

arrange()

arrange(flights, year, month, day)
arrange(flights, desc(arr_delay))

取列

select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
flights %>% select(contains('dep'))

重命名

select(flights, tail_num = tailnum)
rename(flights, tail_num = tailnum)

增添新变量

mutate(flights,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
mutate(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)
transmute(flights,
  gain = arr_delay - dep_delay,
  gain_per_hour = gain / (air_time / 60)
)

汇总

summarise(flights,
  delay = mean(dep_delay, na.rm = TRUE)
)

抽样

sample_n(flights, 10)
sample_frac(flights, 0.01)

分组

by_tailnum <- group_by(flights, tailnum)
delay <- summarise(by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE))
delay <- dplyr::filter(delay, count > 20, dist < 2000)

使用连贯动作

delay <- flights %>% 
    group_by(tailnum) %>% 
    summarise(count = n(),
              dist = mean(distance, na.rm = TRUE),
              delay = mean(arr_delay, na.rm = TRUE)) %>% 
    filter(count > 20, dist < 2000)
delay
flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(arr > 30 | dep > 30)
## Adding missing grouping variables: `year`, `month`, `day`

多变量汇总

daily <- group_by(flights, year, month, day)
(per_day   <- summarise(daily, flights = n()))
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year  <- summarise(per_month, flights = sum(flights)))

多表合并

flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)

flights2 %>% 
  left_join(airlines)
## Joining, by = "carrier"
data("weather")
weather
flights2 %>% left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")

用特定变量合并交集

planes
flights2 %>% left_join(planes, by = "tailnum")

合并变量名称不同?

airports
flights2 %>% left_join(airports, c("dest" = "faa"))
flights2 %>% left_join(airports, c("origin" = "faa"))

四种合并(join)方式

df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(1, 3), a = 10, b = "a")
df1
df2
  1. inner_join(x, y): xy交集
df1 %>% inner_join(df2)
## Joining, by = "x"
  1. left_join(x, y), 保留x的所有观测。最为常见
df1 %>% left_join(df2)
## Joining, by = "x"
  1. right_join(),保留y中的所有观测,等同于left_join(y, x),但列的排序稍有不同。
df1 %>% right_join(df2)
## Joining, by = "x"
df2 %>% left_join(df1)
## Joining, by = "x"
  1. full_join() 保留x, y所有观测,用NA填充。
df1 %>% full_join(df2)
## Joining, by = "x"

与观测有关的join

  1. semi_join(x, y): 保留y中有匹配的x观测值
  2. anti_join(x, y):将x中与y有匹配的数据全部丢弃
flights %>% 
  anti_join(planes, by = "tailnum") %>% 
  count(tailnum, sort = TRUE)
df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))
df1
df2
df1 %>% nrow()
## [1] 4
df1 %>% inner_join(df2, by = "x")
df1 %>% semi_join(df2, by = "x")

集合操作

(df1 <- tibble(x = 1:2, y = c(1L, 1L)))
(df2 <- tibble(x = 1:2, y = 1:2))
intersect(df1, df2)
union(df1, df2)
setdiff(df1, df2)
setdiff(df2, df1)

窗口函数 (window functions)

与mutate和filter结合使用,窗口函数可以大有作为

library(Lahman)
batting <- Lahman::Batting %>%
  as_tibble() %>%
  select(playerID, yearID, teamID, G, AB:H) %>%
  arrange(playerID, yearID, teamID) %>%
  semi_join(Lahman::AwardsPlayers, by = "playerID")

players <- batting %>% group_by(playerID)
players
# For each player, find the two years with most hits
filter(players, min_rank(desc(H)) <= 2 & H > 0)
# Within each player, rank each year by the number of games played
mutate(players, G_rank = min_rank(G))
# For each player, find every year that was better than the previous year
filter(players, G > lag(G))
# For each player, compute avg change in games played per year
mutate(players, G_change = (G - lag(G)) / (yearID - lag(yearID)))
# For each player, find all where they played more games than average
filter(players, G > mean(G))
# For each, player compute a z score based on number of games played
mutate(players, G_z = (G - mean(G)) / sd(G))
  1. 排序和计算顺序:row_number(), min_rank(), dense_rank(), cume_dist(), percent_rank(), and ntile().
  2. 滞后和领先操作: lead() and lag()

排序

x <- c(1, 1, 2, 2, 2)

row_number(x)
## [1] 1 2 3 4 5
min_rank(x)
## [1] 1 1 3 3 3
dense_rank(x)
## [1] 1 1 2 2 2
cume_dist(x)
## [1] 0.4 0.4 1.0 1.0 1.0
# select the top 10% of records within each group
filter(players, cume_dist(desc(G)) < 0.1)
by_team_player <- group_by(batting, teamID, playerID)
by_team <- summarise(by_team_player, G = sum(G))
by_team_quartile <- group_by(by_team, quartile = ntile(G, 4))
summarise(by_team_quartile, mean(G))

领先滞后

x <- 1:5
lead(x)
## [1]  2  3  4  5 NA
lag(x)
## [1] NA  1  2  3  4
# Compute the relative change in games played
mutate(players, G_delta = G - lag(G))
# Find when a player changed teams
filter(players, teamID != lag(teamID))